#!/usr/bin/env python3

# Find the list of (binary) packages containing at least one manpage

import gzip
import json
import os

DIST = "testing"
ARCH = "amd64"

# Maps each binary package name to the list of manpage records it ships
manpages = {}

# Find all manpages from the list of files in Contents-* (read as plain text)
with open (DIST + '/' + ARCH + '/' + 'Contents-' + ARCH, 'rt', encoding='UTF-8') as f:

    # Search man files in the list of files
    # Example line: "usr/share/man/man8/useradd.8.gz admin/passwd"
    # Example line: "usr/share/man/it/man8/useradd.8.gz admin/passwd"
    # Example line: "usr/share/man/man1/foo.1.gz utils/foo,utils/foo-extra"
    for line_number, line in enumerate (f, 1):
        line = line.strip (' \t\n')

        if not line.startswith ('usr/share/man/'):
            continue

        # The file name may itself contain spaces while package names cannot,
        # so the location column is whatever follows the LAST run of whitespace
        columns = line.rsplit (None, 1)

        if len (columns) != 2:
            # Abort with a non-zero exit status so callers notice the failure
            raise SystemExit ('Bad line! (Contents line {}): {!r}'.format (line_number, line))

        page_file, locations = columns

        # usr/share/man/<lang>/man8/useradd.8.gz admin/passwd
        # |                       | |       | |  |     |_____ distro_package
        # |                       | |       | |  |___________ distro_section (not stored)
        # |                       | |       | |______________ page_compression
        # |                       | |       |________________ page_section_variant
        # |                       | |________________________ page_name
        # |                       |__________________________ page_section
        # |__________________________________________________ page_path

        page_path, page_full_name = page_file.rsplit ('/', 1)

        # Skip pages in /usr/share/man
        # They should not be here, they should be in /usr/share/man/man*
        if page_path.lower () == 'usr/share/man':
            continue

        # Last character of the directory name, e.g. "8" for ".../man8"
        page_section = page_path[-1]

        # Expect "<name>.<section variant>.<compression>"; anything else
        # cannot be described by our record, so report it and move on
        name_parts = page_full_name.rsplit ('.', 2)

        if len (name_parts) != 3:
            print ('Skipping manpage with unexpected file name: ' + page_file)
            continue

        page_name, page_section_variant, page_compression = name_parts

        # Detect language: pages directly under man<N> are the untranslated
        # ones, otherwise the 4th path component is the language code
        if page_path.startswith ('usr/share/man/man'):
            page_language = 'en'
        else:
            page_language = page_path.split ('/', 4)[3]

        # Check if this is a link to another manpage
        # For example pyrogenesis.6 is a link to 0ad.6
        # TODO this doesn't take into account languages!!!
        file_location = 'man/man' + page_section + '/' + page_full_name

        # Extract the identifier of the link target
        # /home/user/man/man6/0ad.6.gz => 0ad.6
        page_link = None

        if os.path.islink (file_location):
            target_name = os.path.basename (os.path.realpath (file_location))
            page_link = target_name.rsplit ('.', 1)[0]

        page = {
            'file':            page_file,
            'link_to':         page_link,
            'path':            page_path,
            'identifier':      page_name + '.' + page_section_variant,
            'full_name':       page_full_name,
            'name':            page_name,
            'section':         page_section,
            # NOTE: misspelled key kept as-is, it is part of the JSON output
            'section_varinat': page_section_variant,
            'compression':     page_compression,
            'language':        page_language
        }

        # The location column is a comma separated list of qualified package
        # names ("[section/]package"): the same file can be shipped by several
        # packages, and a package can have multiple pages
        for location in locations.split (','):
            distro_package = location.rsplit ('/', 1)[-1]

            if not distro_package:
                continue

            # Each package gets its own copy of the record
            manpages.setdefault (distro_package, []).append (dict (page))

packages = []

# Control fields read from each stanza, mapped to the keys of our records
_PACKAGE_FIELDS = {
    'Package':      'name',
    'Version':      'version',
    'Architecture': 'architecture',
    'Section':      'section',
    'Filename':     'filename'
}


def _store_package (package):
    """
    Append a copy of `package` (plus its manpages) to `packages`.

    Nothing is stored when the package has no entry in `manpages` or when
    any of its properties is still None (incomplete stanza).
    """
    # Does this package have manpages?
    if package['name'] not in manpages:
        return

    # Make sure package has all properties
    if None in package.values ():
        return

    # Add the manpages to the copy only: the working dictionary is reused for
    # the next stanza and must keep exactly its original set of keys
    record = package.copy ()
    record['manpages'] = manpages[package['name']]

    packages.append (record)


# Extract list of packages that contain at least one manpage
with open (DIST + '/' + ARCH + '/' + 'Packages', 'rt', encoding='UTF-8') as f:

    package = {
        'name':         None,
        'version':      None,
        'architecture': None,
        'section':      None,
        'filename':     None,
        'deb':          None
    }

    for line in f:
        line = line.rstrip ('\n')

        # Empty line defines the end of a package metadata
        if line.strip (' \t') == '':
            _store_package (package)

            # Reset package dictionary
            for key in package:
                package[key] = None

            continue

        # Lines starting with whitespace continue the previous field (e.g. a
        # long description); their text must not be mistaken for a field
        if line[0] in ' \t':
            continue

        field, separator, value = line.partition (':')
        key = _PACKAGE_FIELDS.get (field)

        if not separator or key is None:
            continue

        # An empty value counts as a missing property
        package[key] = value.strip () or None

        if key == 'filename' and package['filename'] is not None:
            package['deb'] = os.path.basename (package['filename'])

    # The last stanza is not necessarily followed by an empty line
    _store_package (package)

# Write the download list: one mirror URL per selected package
with open ('packages.url', 'wt') as url_list:
    for package in packages:
        url = "http://mi.mirror.garr.it/mirrors/debian/" + package['filename']
        print (url, file=url_list)

# Serialize the selected packages, together with their manpages, as JSON
with open ('manpages.json', 'wt') as json_file:
    serialized = json.dumps (packages, sort_keys=True, indent=4)
    json_file.write (serialized)
